In [1]:
! python -m spacy download en_core_web_sm
! pip install sentence-transformers umap-learn bertopic
! pip install spacy
! pip install openpyxl
! pip install hdbscan
! pip install datasets
! pip install transformers torch
! pip install --upgrade sentence_transformers umap-learn hdbscan bertopic
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 2.6 MB/s eta 0:00:0000:0100:01
Requirement already satisfied: spacy<3.8.0,>=3.7.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from en-core-web-sm==3.7.1) (3.7.4)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.8)
Requirement already satisfied: numpy>=1.19.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.4)
Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.2.3)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.9)
Requirement already satisfied: setuptools in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (65.6.3)
Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.10)
Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.5)
Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.4.8)
Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.3.4)
Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.0)
Requirement already satisfied: packaging>=20.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (22.0)
Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.2)
Requirement already satisfied: typer<0.10.0,>=0.3.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.9.4)
Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (5.2.1)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.10)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.7.0)
Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.12)
Requirement already satisfied: jinja2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.1.2)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.28.1)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.64.1)
Requirement already satisfied: pydantic-core==2.18.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.18.1)
Requirement already satisfied: typing-extensions>=4.6.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.9.0)
Requirement already satisfied: annotated-types>=0.4.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.6.0)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.14)
Requirement already satisfied: certifi>=2017.4.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2023.5.7)
Requirement already satisfied: idna<4,>=2.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.4)
Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.11)
Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.4)
Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from typer<0.10.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.1.7)
Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from weasel<0.4.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.16.0)
Requirement already satisfied: MarkupSafe>=2.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from jinja2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.1.1)
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
Requirement already satisfied: sentence-transformers in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (2.7.0)
Requirement already satisfied: umap-learn in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (0.5.6)
Requirement already satisfied: bertopic in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (0.16.2)
Requirement already satisfied: Pillow in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (9.4.0)
Requirement already satisfied: scipy in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (1.10.0)
Requirement already satisfied: numpy in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (1.26.4)
Requirement already satisfied: tqdm in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (4.64.1)
Requirement already satisfied: huggingface-hub>=0.15.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (0.22.2)
Requirement already satisfied: transformers<5.0.0,>=4.34.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (4.39.3)
Requirement already satisfied: torch>=1.11.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (1.12.1)
Requirement already satisfied: scikit-learn in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (1.4.2)
Requirement already satisfied: pynndescent>=0.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from umap-learn) (0.5.12)
Requirement already satisfied: numba>=0.51.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from umap-learn) (0.59.1)
Requirement already satisfied: plotly>=4.7.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from bertopic) (5.9.0)
Requirement already satisfied: pandas>=1.1.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from bertopic) (2.0.1)
Requirement already satisfied: hdbscan>=0.8.29 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from bertopic) (0.8.33)
Requirement already satisfied: cython<3,>=0.27 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan>=0.8.29->bertopic) (0.29.37)
Requirement already satisfied: joblib>=1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan>=0.8.29->bertopic) (1.2.0)
Requirement already satisfied: pyyaml>=5.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (6.0)
Requirement already satisfied: requests in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (2.28.1)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (4.9.0)
Requirement already satisfied: filelock in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (3.9.0)
Requirement already satisfied: fsspec>=2023.5.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (2024.3.1)
Requirement already satisfied: packaging>=20.9 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (22.0)
Requirement already satisfied: llvmlite<0.43,>=0.42.0dev0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from numba>=0.51.2->umap-learn) (0.42.0)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas>=1.1.5->bertopic) (2.8.2)
Requirement already satisfied: tzdata>=2022.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas>=1.1.5->bertopic) (2023.3)
Requirement already satisfied: pytz>=2020.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas>=1.1.5->bertopic) (2022.7)
Requirement already satisfied: tenacity>=6.2.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from plotly>=4.7.0->bertopic) (8.0.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from scikit-learn->sentence-transformers) (2.2.0)
Requirement already satisfied: tokenizers<0.19,>=0.14 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence-transformers) (0.15.2)
Requirement already satisfied: safetensors>=0.4.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence-transformers) (0.4.3)
Requirement already satisfied: regex!=2019.12.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence-transformers) (2022.7.9)
Requirement already satisfied: six>=1.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas>=1.1.5->bertopic) (1.16.0)
Requirement already satisfied: certifi>=2017.4.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers) (2023.5.7)
Requirement already satisfied: idna<4,>=2.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers) (2.0.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers) (1.26.14)
Requirement already satisfied: spacy in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (3.7.4)
Requirement already satisfied: jinja2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (3.1.2)
Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (5.2.1)
Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (3.3.0)
Requirement already satisfied: setuptools in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (65.6.3)
Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (0.3.4)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (2.0.8)
Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (8.2.3)
Requirement already satisfied: numpy>=1.19.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (1.26.4)
Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (3.0.12)
Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (2.4.8)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (4.64.1)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (2.7.0)
Requirement already satisfied: typer<0.10.0,>=0.3.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (0.9.4)
Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (1.1.2)
Requirement already satisfied: packaging>=20.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (22.0)
Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (2.0.10)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (1.0.10)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (2.28.1)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (3.0.9)
Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (1.0.5)
Requirement already satisfied: pydantic-core==2.18.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (2.18.1)
Requirement already satisfied: typing-extensions>=4.6.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (4.9.0)
Requirement already satisfied: annotated-types>=0.4.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (0.6.0)
Requirement already satisfied: charset-normalizer<3,>=2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.0.4)
Requirement already satisfied: certifi>=2017.4.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2023.5.7)
Requirement already satisfied: idna<4,>=2.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.14)
Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy) (0.7.11)
Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy) (0.1.4)
Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from typer<0.10.0,>=0.3.0->spacy) (8.1.7)
Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from weasel<0.4.0,>=0.1.0->spacy) (0.16.0)
Requirement already satisfied: MarkupSafe>=2.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from jinja2->spacy) (2.1.1)
Requirement already satisfied: openpyxl in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (3.0.10)
Requirement already satisfied: et_xmlfile in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from openpyxl) (1.1.0)
Requirement already satisfied: hdbscan in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (0.8.33)
Requirement already satisfied: scikit-learn>=0.20 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan) (1.4.2)
Requirement already satisfied: numpy>=1.20 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan) (1.26.4)
Requirement already satisfied: cython<3,>=0.27 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan) (0.29.37)
Requirement already satisfied: scipy>=1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan) (1.10.0)
Requirement already satisfied: joblib>=1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from scikit-learn>=0.20->hdbscan) (2.2.0)
Requirement already satisfied: datasets in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (2.19.1)
Requirement already satisfied: numpy>=1.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (1.26.4)
Requirement already satisfied: requests>=2.19.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (2.28.1)
Requirement already satisfied: multiprocess in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (0.70.16)
Requirement already satisfied: pyyaml>=5.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (6.0)
Requirement already satisfied: pyarrow>=12.0.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (16.0.0)
Requirement already satisfied: filelock in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (3.9.0)
Requirement already satisfied: xxhash in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (3.4.1)
Requirement already satisfied: huggingface-hub>=0.21.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (0.22.2)
Requirement already satisfied: pandas in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (2.0.1)
Requirement already satisfied: dill<0.3.9,>=0.3.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (0.3.8)
Requirement already satisfied: packaging in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (22.0)
Requirement already satisfied: fsspec[http]<=2024.3.1,>=2023.1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (2024.3.1)
Requirement already satisfied: tqdm>=4.62.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (4.64.1)
Requirement already satisfied: pyarrow-hotfix in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (0.6)
Requirement already satisfied: aiohttp in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (3.9.5)
Requirement already satisfied: yarl<2.0,>=1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from aiohttp->datasets) (1.9.4)
Requirement already satisfied: async-timeout<5.0,>=4.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from aiohttp->datasets) (4.0.3)
Requirement already satisfied: multidict<7.0,>=4.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from aiohttp->datasets) (6.0.5)
Requirement already satisfied: aiosignal>=1.1.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from aiohttp->datasets) (1.3.1)
Requirement already satisfied: frozenlist>=1.1.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from aiohttp->datasets) (1.4.1)
Requirement already satisfied: attrs>=17.3.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from aiohttp->datasets) (23.2.0)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.21.2->datasets) (4.9.0)
Requirement already satisfied: charset-normalizer<3,>=2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (2.0.4)
Requirement already satisfied: certifi>=2017.4.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (2023.5.7)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (1.26.14)
Requirement already satisfied: idna<4,>=2.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (3.4)
Requirement already satisfied: tzdata>=2022.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas->datasets) (2023.3)
Requirement already satisfied: pytz>=2020.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas->datasets) (2022.7)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas->datasets) (2.8.2)
Requirement already satisfied: six>=1.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)
Requirement already satisfied: transformers in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (4.39.3)
Requirement already satisfied: torch in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (1.12.1)
Requirement already satisfied: tqdm>=4.27 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (4.64.1)
Requirement already satisfied: tokenizers<0.19,>=0.14 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (0.15.2)
Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (0.22.2)
Requirement already satisfied: pyyaml>=5.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (6.0)
Requirement already satisfied: numpy>=1.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (1.26.4)
Requirement already satisfied: requests in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (2.28.1)
Requirement already satisfied: safetensors>=0.4.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (0.4.3)
Requirement already satisfied: regex!=2019.12.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (2022.7.9)
Requirement already satisfied: filelock in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (3.9.0)
Requirement already satisfied: packaging>=20.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (22.0)
Requirement already satisfied: typing_extensions in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from torch) (4.9.0)
Requirement already satisfied: fsspec>=2023.5.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.19.3->transformers) (2024.3.1)
Requirement already satisfied: certifi>=2017.4.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->transformers) (2023.5.7)
Requirement already satisfied: idna<4,>=2.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->transformers) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->transformers) (2.0.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->transformers) (1.26.14)
Requirement already satisfied: sentence_transformers in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (2.7.0)
Requirement already satisfied: umap-learn in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (0.5.6)
Requirement already satisfied: hdbscan in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (0.8.33)
Requirement already satisfied: bertopic in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (0.16.2)
Requirement already satisfied: Pillow in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (9.4.0)
Requirement already satisfied: torch>=1.11.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (1.12.1)
Requirement already satisfied: numpy in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (1.26.4)
Requirement already satisfied: scikit-learn in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (1.4.2)
Requirement already satisfied: transformers<5.0.0,>=4.34.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (4.39.3)
Requirement already satisfied: huggingface-hub>=0.15.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (0.22.2)
Requirement already satisfied: scipy in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (1.10.0)
Requirement already satisfied: tqdm in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (4.64.1)
Requirement already satisfied: pynndescent>=0.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from umap-learn) (0.5.12)
Requirement already satisfied: numba>=0.51.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from umap-learn) (0.59.1)
Requirement already satisfied: cython<3,>=0.27 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan) (0.29.37)
Requirement already satisfied: joblib>=1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan) (1.2.0)
Requirement already satisfied: plotly>=4.7.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from bertopic) (5.9.0)
Requirement already satisfied: pandas>=1.1.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from bertopic) (2.0.1)
Requirement already satisfied: filelock in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (3.9.0)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (4.9.0)
Requirement already satisfied: packaging>=20.9 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (22.0)
Requirement already satisfied: pyyaml>=5.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (6.0)
Requirement already satisfied: fsspec>=2023.5.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (2024.3.1)
Requirement already satisfied: requests in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (2.28.1)
Requirement already satisfied: llvmlite<0.43,>=0.42.0dev0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from numba>=0.51.2->umap-learn) (0.42.0)
Requirement already satisfied: pytz>=2020.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas>=1.1.5->bertopic) (2022.7)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas>=1.1.5->bertopic) (2.8.2)
Requirement already satisfied: tzdata>=2022.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas>=1.1.5->bertopic) (2023.3)
Requirement already satisfied: tenacity>=6.2.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from plotly>=4.7.0->bertopic) (8.0.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from scikit-learn->sentence_transformers) (2.2.0)
Requirement already satisfied: regex!=2019.12.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (2022.7.9)
Requirement already satisfied: tokenizers<0.19,>=0.14 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (0.15.2)
Requirement already satisfied: safetensors>=0.4.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (0.4.3)
Requirement already satisfied: six>=1.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas>=1.1.5->bertopic) (1.16.0)
Requirement already satisfied: certifi>=2017.4.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (2023.5.7)
Requirement already satisfied: idna<4,>=2.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (2.0.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (1.26.14)
In [2]:
import os
import pandas as pd
import openpyxl
from datetime import datetime
import matplotlib.pyplot as plt
from textblob import TextBlob
import ast
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
from nltk.tokenize import word_tokenize
from collections import Counter
from bertopic import BERTopic
import random
import numpy as np
import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import matplotlib.cm as cm
from collections import Counter
from umap import UMAP
from wordcloud import WordCloud
import seaborn as sns
from scipy.stats import f_oneway
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from gensim import corpora, models
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from scipy.cluster.hierarchy import linkage, fcluster
In [3]:
# Download necessary NLTK resources
nltk.download('punkt')  # Tokenizer model
nltk.download('stopwords')  # Stopwords list
nltk.download('wordnet')  # Lexical database for lemmatization
nltk.download('omw-1.4')  # Open Multilingual Wordnet, needed for lemmatization in multiple languages
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/helgegeurtjacobusmoes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/helgegeurtjacobusmoes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/helgegeurtjacobusmoes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/helgegeurtjacobusmoes/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Out[3]:
True
In [4]:
# Load the Excel file into a DataFrame
excel_path = '/Users/helgegeurtjacobusmoes/Desktop/thesis data/Updated_Merged_Data.xlsx'

# Load the Excel file into a DataFrame
updated_merged_data = pd.read_excel(excel_path)

updated_merged_data
Out[4]:
Headline Publication URL News Outlet Type of News Word Count Body Publication Date
0 Nee, kunstmatige intelligentie gaat ons niet u... Trouw, Verdieping; Blz. 4, 5, 2044 words https://advance.lexis.com/api/document?collect... Trouw Verdieping 2044 Welkom in de AI-fabriek serie\nDat kunstmatige... 7 december 2023 donderdag
1 Wereldleiders zoeken grip op kunstmatige intel... Trouw, Vandaag; Blz. 6, 528 words https://advance.lexis.com/api/document?collect... Trouw Vandaag 528 Op het Britse landgoed Bletchley Park werden t... 3 november 2023 vrijdag
2 Kunstmatige intelligentie is best bedreigend Trouw, Tijdgeest; Blz. 8, 576 words https://advance.lexis.com/api/document?collect... Trouw Tijdgeest 576 Of kunstmatige intelligentie nuttig is (Tijdge... 13 mei 2023 zaterdag
3 Mensen zijn een stuk efficiënter dan kunstmati... Trouw, Vandaag; Blz. 3, 741 words https://advance.lexis.com/api/document?collect... Trouw Vandaag 741 De wereld raakte het afgelopen jaar in de ban ... 21 oktober 2023 zaterdag
4 Bedreigt kunstmatige intelligentie ons godsbeeld? Trouw, Religie en Filosofie; Blz. 8, 9, 1367 w... https://advance.lexis.com/api/document?collect... Trouw Religie en Filosofie 1367 Theologisch elftal\n'In het begin was het Woor... 16 december 2022 vrijdag
... ... ... ... ... ... ... ... ...
6441 De rauwe realiteit Het Financieele Dagblad, MORGEN; Blz. 4, 2920 ... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad MORGEN 2920 Grootse oplossingen\nDrie stedelijke 'ontwrich... 14 oktober 2017 zaterdag 12:00 AM GMT
6442 No Headline In Original Het Financieele Dagblad, PAGINA 13; Blz. 13, 1... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad PAGINA 114 klinkt als muziek\nDe Walkman, van Sony, is vo... 29 april 2023 zaterdag 12:00 AM GMT
6443 Groeten uit het hart van de hightech Het Financieele Dagblad, WEEKEND; Blz. 6, 2799... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad WEEKEND 2799 Het is zover voor 'onze man in San Francisco'.... 20 augustus 2016 zaterdag 12:00 AM GMT
6444 De complete lijst Jonge Talenten 2019 Het Financieele Dagblad, FD PERSOONLIJK; Arbei... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad FD PERSOONLIJK; Arbeidsmarkt 8007 Rebel werkte zes jaar bij zakenbank Morgan Sta... 17 januari 2019 donderdag 1:00 PM GMT
6445 No Headline In Original Het Financieele Dagblad, DE WERELD; Blz. 30, 9... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad DE WERELD 969 The Conversation (Londen)Gates Notes (VS)The E... 8 december 2018 zaterdag 12:00 AM GMT

6446 rows × 8 columns

In [5]:
# Load Dutch stopwords
stop_words_nl = set(stopwords.words('dutch'))

# Define a preprocess_text function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Tokenize text
    words = word_tokenize(text)
    # Remove numbers
    words = [re.sub(r'\d+', '', word) for word in words]
    # Remove punctuation and special characters
    words = [word for word in words if word.isalnum()]
    # Remove stopwords
    stop_words = set(stopwords.words('Dutch'))
    stop_words.update(['No Headline In Original', 'trouw', 'volkskrant', 'financieele', 'algemeen', 'dagblad', 'nrc', 'telegraaf'])
    words = [word for word in words if word not in stop_words]
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Join words back to text
    text = ' '.join(words)
    return text

# Create the Combined column and preprocess it
updated_merged_data["Combined"] = updated_merged_data["Headline"].fillna("") + " " + updated_merged_data["Body"].fillna("")
updated_merged_data["Combined"] = updated_merged_data["Combined"].apply(preprocess_text)

# Ensure there are no NaN values in the Combined column
updated_merged_data.dropna(subset=["Combined"], inplace=True)
updated_merged_data.reset_index(drop=True, inplace=True)

updated_merged_data  
Out[5]:
Headline Publication URL News Outlet Type of News Word Count Body Publication Date Combined
0 Nee, kunstmatige intelligentie gaat ons niet u... Trouw, Verdieping; Blz. 4, 5, 2044 words https://advance.lexis.com/api/document?collect... Trouw Verdieping 2044 Welkom in de AI-fabriek serie\nDat kunstmatige... 7 december 2023 donderdag nee kunstmatige intelligentie gaat uitroeien w...
1 Wereldleiders zoeken grip op kunstmatige intel... Trouw, Vandaag; Blz. 6, 528 words https://advance.lexis.com/api/document?collect... Trouw Vandaag 528 Op het Britse landgoed Bletchley Park werden t... 3 november 2023 vrijdag wereldleiders zoeken grip kunstmatige intellig...
2 Kunstmatige intelligentie is best bedreigend Trouw, Tijdgeest; Blz. 8, 576 words https://advance.lexis.com/api/document?collect... Trouw Tijdgeest 576 Of kunstmatige intelligentie nuttig is (Tijdge... 13 mei 2023 zaterdag kunstmatige intelligentie best bedreigend kuns...
3 Mensen zijn een stuk efficiënter dan kunstmati... Trouw, Vandaag; Blz. 3, 741 words https://advance.lexis.com/api/document?collect... Trouw Vandaag 741 De wereld raakte het afgelopen jaar in de ban ... 21 oktober 2023 zaterdag mensen stuk efficiënter kunstmatige intelligen...
4 Bedreigt kunstmatige intelligentie ons godsbeeld? Trouw, Religie en Filosofie; Blz. 8, 9, 1367 w... https://advance.lexis.com/api/document?collect... Trouw Religie en Filosofie 1367 Theologisch elftal\n'In het begin was het Woor... 16 december 2022 vrijdag bedreigt kunstmatige intelligentie godsbeeld t...
... ... ... ... ... ... ... ... ... ...
6441 De rauwe realiteit Het Financieele Dagblad, MORGEN; Blz. 4, 2920 ... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad MORGEN 2920 Grootse oplossingen\nDrie stedelijke 'ontwrich... 14 oktober 2017 zaterdag 12:00 AM GMT rauwe realiteit grootse oplossingen drie stede...
6442 No Headline In Original Het Financieele Dagblad, PAGINA 13; Blz. 13, 1... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad PAGINA 114 klinkt als muziek\nDe Walkman, van Sony, is vo... 29 april 2023 zaterdag 12:00 AM GMT no headline original klinkt muziek walkman son...
6443 Groeten uit het hart van de hightech Het Financieele Dagblad, WEEKEND; Blz. 6, 2799... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad WEEKEND 2799 Het is zover voor 'onze man in San Francisco'.... 20 augustus 2016 zaterdag 12:00 AM GMT groeten hart hightech zover man san francisco ...
6444 De complete lijst Jonge Talenten 2019 Het Financieele Dagblad, FD PERSOONLIJK; Arbei... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad FD PERSOONLIJK; Arbeidsmarkt 8007 Rebel werkte zes jaar bij zakenbank Morgan Sta... 17 januari 2019 donderdag 1:00 PM GMT complete lijst jonge talenten rebel werkte z j...
6445 No Headline In Original Het Financieele Dagblad, DE WERELD; Blz. 30, 9... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad DE WERELD 969 The Conversation (Londen)Gates Notes (VS)The E... 8 december 2018 zaterdag 12:00 AM GMT no headline original the conversation londen g...

6446 rows × 9 columns

In [6]:
# Dictionary for Dutch to English month translation
dutch_months = {
    "januari": "January", "februari": "February", "maart": "March",
    "april": "April", "mei": "May", "juni": "June",
    "juli": "July", "augustus": "August", "september": "September",
    "oktober": "October", "november": "November", "december": "December"
}

# Function to translate Dutch month names to English and format the date
def translate_date(date_str):
    if pd.isna(date_str):
        return None  # Return None if the date is NaN
    try:
        parts = date_str.split()
        if len(parts) >= 3:
            day = parts[0].zfill(2)  # Ensure day is two digits
            month_dutch = parts[1].lower()
            year = parts[2]
            month_english = dutch_months.get(month_dutch, None)
            if not month_english:
                return None  # Return None if the month is not found
            date_str_english = f"{day}-{month_english}-{year}"
            date_obj = datetime.strptime(date_str_english, "%d-%B-%Y")
            return date_obj.strftime("%d-%m-%Y")
    except Exception as e:
        print(f"Error parsing date '{date_str}': {e}")
        return None

# Apply the translation and conversion function to the 'Publication Date' column
updated_merged_data['Publication Date'] = updated_merged_data['Publication Date'].apply(translate_date)

# Print out some of the cleaned data to verify
updated_merged_data
Out[6]:
Headline Publication URL News Outlet Type of News Word Count Body Publication Date Combined
0 Nee, kunstmatige intelligentie gaat ons niet u... Trouw, Verdieping; Blz. 4, 5, 2044 words https://advance.lexis.com/api/document?collect... Trouw Verdieping 2044 Welkom in de AI-fabriek serie\nDat kunstmatige... 07-12-2023 nee kunstmatige intelligentie gaat uitroeien w...
1 Wereldleiders zoeken grip op kunstmatige intel... Trouw, Vandaag; Blz. 6, 528 words https://advance.lexis.com/api/document?collect... Trouw Vandaag 528 Op het Britse landgoed Bletchley Park werden t... 03-11-2023 wereldleiders zoeken grip kunstmatige intellig...
2 Kunstmatige intelligentie is best bedreigend Trouw, Tijdgeest; Blz. 8, 576 words https://advance.lexis.com/api/document?collect... Trouw Tijdgeest 576 Of kunstmatige intelligentie nuttig is (Tijdge... 13-05-2023 kunstmatige intelligentie best bedreigend kuns...
3 Mensen zijn een stuk efficiënter dan kunstmati... Trouw, Vandaag; Blz. 3, 741 words https://advance.lexis.com/api/document?collect... Trouw Vandaag 741 De wereld raakte het afgelopen jaar in de ban ... 21-10-2023 mensen stuk efficiënter kunstmatige intelligen...
4 Bedreigt kunstmatige intelligentie ons godsbeeld? Trouw, Religie en Filosofie; Blz. 8, 9, 1367 w... https://advance.lexis.com/api/document?collect... Trouw Religie en Filosofie 1367 Theologisch elftal\n'In het begin was het Woor... 16-12-2022 bedreigt kunstmatige intelligentie godsbeeld t...
... ... ... ... ... ... ... ... ... ...
6441 De rauwe realiteit Het Financieele Dagblad, MORGEN; Blz. 4, 2920 ... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad MORGEN 2920 Grootse oplossingen\nDrie stedelijke 'ontwrich... 14-10-2017 rauwe realiteit grootse oplossingen drie stede...
6442 No Headline In Original Het Financieele Dagblad, PAGINA 13; Blz. 13, 1... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad PAGINA 114 klinkt als muziek\nDe Walkman, van Sony, is vo... 29-04-2023 no headline original klinkt muziek walkman son...
6443 Groeten uit het hart van de hightech Het Financieele Dagblad, WEEKEND; Blz. 6, 2799... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad WEEKEND 2799 Het is zover voor 'onze man in San Francisco'.... 20-08-2016 groeten hart hightech zover man san francisco ...
6444 De complete lijst Jonge Talenten 2019 Het Financieele Dagblad, FD PERSOONLIJK; Arbei... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad FD PERSOONLIJK; Arbeidsmarkt 8007 Rebel werkte zes jaar bij zakenbank Morgan Sta... 17-01-2019 complete lijst jonge talenten rebel werkte z j...
6445 No Headline In Original Het Financieele Dagblad, DE WERELD; Blz. 30, 9... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad DE WERELD 969 The Conversation (Londen)Gates Notes (VS)The E... 08-12-2018 no headline original the conversation londen g...

6446 rows × 9 columns

In [7]:
# Save the cleaned DataFrame to a new Excel file
updated_merged_data.to_excel("/Users/helgegeurtjacobusmoes/Desktop/thesis data/Cleaned_Updated_Merged_Data.xlsx", index=False)

# Save the cleaned DataFrame to a new CSV file
updated_merged_data.to_csv("/Users/helgegeurtjacobusmoes/Desktop/thesis data/Cleaned_Updated_Merged_Data.csv", index=False)
In [9]:
import pandas as pd
import re

# Load the data from the Excel file
file_path = "/Users/helgegeurtjacobusmoes/Desktop/thesis data/Cleaned_Updated_Merged_Data.xlsx"
data = pd.read_excel(file_path)

# Ensure all entries in the 'Combined' column are treated as strings
data['Combined'] = data['Combined'].astype(str)

# Count the occurrences of each word in the 'Combined' column
data['AI Count'] = data['Combined'].str.count(r'\bAI\b', flags=re.IGNORECASE)
data['Kunstmatige Intelligentie Count'] = data['Combined'].str.count(r'\bKunstmatige Intelligentie\b', flags=re.IGNORECASE)
data['Artificial Intelligence Count'] = data['Combined'].str.count(r'\bArtificial Intelligence\b', flags=re.IGNORECASE)

# Sum the counts for each word across all rows
total_ai_count = data['AI Count'].sum()
total_ki_count = data['Kunstmatige Intelligentie Count'].sum()
total_ai_full_count = data['Artificial Intelligence Count'].sum()

print(f"Total 'AI' count: {total_ai_count}")
print(f"Total 'Kunstmatige Intelligentie' count: {total_ki_count}")
print(f"Total 'Artificial Intelligence' count: {total_ai_full_count}")
Total 'AI' count: 7889
Total 'Kunstmatige Intelligentie' count: 10442
Total 'Artificial Intelligence' count: 276

LDA Topic Modelling¶

This code outlines the procedure for topic modeling using advanced natural language processing (NLP) techniques, specifically utilizing the BERTopic model which incorporates embeddings, dimensionality reduction (UMAP), and clustering (HDBSCAN) to automatically categorize Dutch text data into topics:

  1. Setting Environment Variable:

    • os.environ["OMP_MAX_ACTIVE_LEVELS"] = "2"
    • This line sets the maximum number of nested active parallel regions for the OpenMP environment to '2'. This could be intended to optimize the performance or parallelism of the subsequent computational processes, particularly when using libraries that employ multi-threading.
  2. Loading a Pre-trained Embedding Model:

    • embedding_model = SentenceTransformer('all-MiniLM-L6-v2', language="Dutch")
    • This line initializes a multi-language embedding model, capable of converting sentences into numerical representations. The specific model used is designed for paraphrase identification, useful in understanding semantic similarities in Dutch text.
  3. Creating UMAP and HDBSCAN Models:

    • umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    • hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom')
    • UMAP (Uniform Manifold Approximation and Projection) is set up with specific parameters to perform dimensionality reduction, making it easier to cluster the data by reducing noise and retaining important structural information. HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) then clusters the data into groups based on density, using parameters like minimum cluster size and the method of cluster selection.
  4. Initializing and Fitting the BERTopic Model:

    • model = BERTopic(embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model, nr_topics=12)
    • topics, probabilities = model.fit_transform(sentences)
    • This block initializes the BERTopic model with the embedding, UMAP, and HDBSCAN models configured earlier, and fits it on a set of Dutch sentences. This process clusters the sentences into different topics based on their semantic similarities.
  5. Retrieving and Displaying Topic Frequencies:

    • topic_freq = model.get_topic_freq()
    • After fitting, this retrieves the frequencies of each topic identified, showing how many documents or sentences are associated with each topic.
  6. Generating Topic Details and Summarization:

    • Here, details of each topic are generated, including:
      • A descriptive name based on the most representative words.
      • A representation list of top words in the topic.
      • Selected representative documents that exemplify the topic.
    • The information is collected into a DataFrame for easy viewing and manipulation, sorted by the frequency of each topic to prioritize more common topics.
  7. Displaying the Result:

    • print(topic_details_df)
    • Finally, this line displays the structured DataFrame containing detailed insights into each topic, including their representation and example documents, sorted by their prevalence or count.

This code effectively utilizes advanced machine learning techniques for unsupervised learning to discover and summarize topics within a Dutch text dataset, providing a comprehensive toolset for analyzing text data without predefined categories or topics.

In [ ]:
### Load your data from an Excel file
data = pd.read_excel('/Users/helgegeurtjacobusmoes/Desktop/thesis data/Cleaned_Updated_Merged_Data.xlsx')
content_titles = data['Combined'].fillna("").tolist()

### Load a pre-trained Sentence Transformer model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', language="Dutch")
In [ ]:
### BERTopic model (N = 90): in order to use it, take adjust from Markdown to Code

### N = 90

### Set the maximum number of nested active parallel regions
os.environ["OMP_MAX_ACTIVE_LEVELS"] = "2"

### Create an instance of the UMAP and HDBSCAN model with specific parameters
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom')

#Initialize the BERTopic model with the UMAP model and embedding model
model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, embedding_model='all-MiniLM-L6-v2', language='Dutch')
topics, probabilities = model.fit_transform(content_titles) # Fit the BERTopic model on the Dutch sentences to find topics

### Retrieve the most relevant topics sorted by their size (frequency)
topic_freq = model.get_topic_freq()

### Generate the topics and their details with adjustments
topic_details = []

for index, row in topic_freq.iterrows():
    topic_info = model.get_topic(row['Topic'])
    topic_representation = [word[0] for word in topic_info[:10]]  # Top 10 words as a list
    topic_name = "_".join(topic_representation[:4])  # Concatenate top 4 words for the name
    
    # Find multiple representative documents, assuming 3 for demonstration
    representative_doc_indices = [i for i, t in enumerate(topics) if t == row['Topic']][:3]
    representative_docs = [data['Combined'].iloc[i][:200] for i in representative_doc_indices]  # First 200 chars of docs
    
    topic_details.append({
        'Topic': row['Topic'],
        'Count': row['Count'],
        'Name': topic_name,
        'Representation': topic_representation,
        'Representative_Docs': representative_docs
    })

### Convert the list of topic details to a DataFrame and sort by topic size
topic_details_df = pd.DataFrame(topic_details)
topic_details_df.sort_values(by='Count', ascending=False, inplace=True)

### Display the DataFrame
print(topic_details_df)

Save the DataFrame to an Excel file¶

topic_details_df.to_excel('/Users/helgegeurtjacobusmoes/Desktop/thesis data/Topic_Info.xlsx', index=False)

Save to CSV¶

topic_details_df.to_csv('/Users/helgegeurtjacobusmoes/Desktop/thesis data/Topic_Info.csv', index=False)

In [15]:
model.get_topics()
Out[15]:
{-1: [('we', 0.007486693671432411),
  ('jaar', 0.005841746514178842),
  ('mensen', 0.005760335472060161),
  ('wel', 0.0053916899802409326),
  ('intelligentie', 0.005176980062808073),
  ('zegt', 0.005122953483765532),
  ('kunstmatige', 0.005041534678303338),
  ('nieuwe', 0.004908017523229031),
  ('gaat', 0.004751231408093411),
  ('waar', 0.004696876955222825)],
 0: [('robot', 0.036894319081756936),
  ('men', 0.009610574570887872),
  ('mensen', 0.009350006461776817),
  ('we', 0.008977446982004506),
  ('machine', 0.00819859066770282),
  ('intelligentie', 0.008019534151900733),
  ('kunstmatige', 0.007426000831894131),
  ('banen', 0.006814265713362343),
  ('werk', 0.00596302546298388),
  ('gaan', 0.005880697831705383)],
 1: [('patiënten', 0.02054571676460878),
  ('patiënt', 0.01879193747453304),
  ('zorg', 0.013898342519898497),
  ('artsen', 0.012666471470354638),
  ('medische', 0.01208717804474124),
  ('philip', 0.010798196805020838),
  ('ziekenhuis', 0.010701476989546876),
  ('art', 0.010257983328512037),
  ('ziekenhuizen', 0.008857686281916579),
  ('zegt', 0.007659157110545914)],
 2: [('auto', 0.04843310437881789),
  ('rijden', 0.015910556461750054),
  ('zelfrijdende', 0.015823320394147612),
  ('tesla', 0.012237675204364211),
  ('elektrische', 0.012154891939629823),
  ('bestuurder', 0.00968288950438033),
  ('bmw', 0.009117512960606797),
  ('kilometer', 0.008714056144262496),
  ('stuur', 0.008384341857783556),
  ('mercedes', 0.008144737126493625)],
 3: [('beleggers', 0.026467443138532998),
  ('rente', 0.01610901335310583),
  ('inflatie', 0.01498591452145292),
  ('aex', 0.014748895709551435),
  ('aandelen', 0.014567585566011993),
  ('amerikaanse', 0.012617841423292532),
  ('aandeel', 0.01233940965485618),
  ('centrale', 0.012312847633199259),
  ('lager', 0.012121198454777196),
  ('beurs', 0.012107457442931171)],
 4: [('chatgpt', 0.03850379806447094),
  ('chatbot', 0.03411144536881021),
  ('chatbots', 0.018616167463824263),
  ('openai', 0.012682621379957747),
  ('google', 0.009953579546612235),
  ('bing', 0.009701879674701562),
  ('teksten', 0.009613864292455254),
  ('bot', 0.009543269691302208),
  ('antwoord', 0.009284442108851454),
  ('antwoorden', 0.009075252583562282)],
 5: [('europese', 0.03170400335016237),
  ('europa', 0.024239461613010913),
  ('eu', 0.02423497280347295),
  ('brussel', 0.01570189314577477),
  ('commissie', 0.014542876104929362),
  ('china', 0.013457062308428157),
  ('lidstaten', 0.011853367672038548),
  ('duitsland', 0.011299785777401388),
  ('macron', 0.011050098241167783),
  ('landen', 0.0107575713189513)],
 6: [('mln', 0.018129897408156365),
  ('bedrijven', 0.017915474626858302),
  ('investeerders', 0.013664917143833085),
  ('jaar', 0.013406939820455285),
  ('bedrijf', 0.013202050297342009),
  ('geld', 0.013168934888138422),
  ('fonds', 0.012642215878518027),
  ('investeringen', 0.011206077080362151),
  ('mrd', 0.010804337063322905),
  ('nederland', 0.009404007362804825)],
 7: [('film', 0.025833011587163484),
  ('the', 0.014197174045375014),
  ('sciencefiction', 0.012895229708015047),
  ('depp', 0.010680551410291204),
  ('regisseur', 0.009238444502253631),
  ('uur', 0.008910894908988363),
  ('genre', 0.00845613946079103),
  ('alien', 0.008452462082901522),
  ('machina', 0.008041264615053952),
  ('westworld', 0.007948262225432665)],
 8: [('dieren', 0.027959134970174982),
  ('natuur', 0.010794302118141613),
  ('we', 0.010456012053981563),
  ('soorten', 0.010005038172921043),
  ('bomen', 0.009659389135932351),
  ('dier', 0.009588568033883187),
  ('planten', 0.00800509912776644),
  ('geluiden', 0.007607768408833419),
  ('biodiversiteit', 0.006332818893363864),
  ('haven', 0.006302809649575891)],
 9: [('ai', 0.020015044217339985),
  ('intelligentie', 0.01585315766014987),
  ('computer', 0.014428266257230453),
  ('kunstmatige', 0.01381441795455606),
  ('men', 0.010040684554594215),
  ('machine', 0.009972216799732881),
  ('mensen', 0.009512063400004895),
  ('we', 0.009237523888398335),
  ('google', 0.00757490235430275),
  ('menselijke', 0.006635748392810741)],
 10: [('democratie', 0.01081273017934664),
  ('we', 0.008897075371402958),
  ('politieke', 0.008281272067715357),
  ('partij', 0.007810137396257425),
  ('politiek', 0.007741899787388279),
  ('mensen', 0.007338217756042906),
  ('partijen', 0.007160627204172948),
  ('samenleving', 0.006595275516290021),
  ('onze', 0.005985739397760472),
  ('kamer', 0.005976853840916498)],
 11: [('china', 0.05255032117446615),
  ('chinese', 0.025933538769083293),
  ('xi', 0.02005647185063052),
  ('trump', 0.011700512117273892),
  ('amerikaanse', 0.011065261366488506),
  ('landen', 0.010783824369749374),
  ('beijing', 0.010749226939086406),
  ('taiwan', 0.010111998813764643),
  ('land', 0.00995176688549835),
  ('economische', 0.00950688617621198)],
 12: [('brein', 0.03491756621367846),
  ('hersenen', 0.024459728120974376),
  ('neuralink', 0.015220383672101233),
  ('musk', 0.014533443235677734),
  ('chip', 0.012693202174289712),
  ('computer', 0.011654856109238337),
  ('gedachten', 0.010645746681771585),
  ('we', 0.010127378349971773),
  ('proefpersonen', 0.009755965645299134),
  ('neuronen', 0.009401203258197348)],
 13: [('roman', 0.018320271607424346),
  ('boek', 0.013719131023525087),
  ('frankenstein', 0.013164989492325165),
  ('verhaal', 0.012834789107015415),
  ('mary', 0.009142123340811248),
  ('shelley', 0.009006890324468542),
  ('leven', 0.008792163828805592),
  ('jezus', 0.008107720214931944),
  ('beslisser', 0.007993921982795684),
  ('fictie', 0.007741907994341967)],
 14: [('spel', 0.027329789144501505),
  ('game', 0.02602762248489176),
  ('schaken', 0.02397336232212592),
  ('computer', 0.01745600458746091),
  ('go', 0.013926178771724345),
  ('spelers', 0.013372154554443794),
  ('spelen', 0.013064686927075934),
  ('speler', 0.012290860464764474),
  ('spellen', 0.01214900749602897),
  ('alphago', 0.011181251430792582)],
 15: [('fonds', 0.01718232978919989),
  ('geld', 0.0170836528138853),
  ('kabinet', 0.015507136279966728),
  ('economische', 0.013941990209534324),
  ('hoekstra', 0.013800905329048093),
  ('euro', 0.013365841160077758),
  ('wiebes', 0.013280787061013885),
  ('economie', 0.013135545046681227),
  ('groeifonds', 0.01193689778422462),
  ('crisis', 0.011786022407887623)],
 16: [('banen', 0.029075959532796364),
  ('ai', 0.01727987218723208),
  ('werknemers', 0.016057461555312808),
  ('arbeidsmarkt', 0.015722247280857804),
  ('werk', 0.014919887726648146),
  ('wework', 0.00979397903856953),
  ('mensen', 0.00907184175858113),
  ('nieuwe', 0.00853580674894267),
  ('bedrijven', 0.007893885635176215),
  ('gaan', 0.007856328827606928)],
 17: [('valley', 0.030819368815575558),
  ('silicon', 0.02871996800888954),
  ('san', 0.01090180320034695),
  ('francisco', 0.01050290156722862),
  ('stanford', 0.01003659502451361),
  ('bedrijven', 0.008626651291092571),
  ('facebook', 0.006950928126626263),
  ('waar', 0.006871898576345437),
  ('bedrijf', 0.006688764543352133),
  ('techbedrijven', 0.006368574357398072)],
 18: [('microsoft', 0.07270962025768889),
  ('openai', 0.019374333620835266),
  ('nadella', 0.018428226602917937),
  ('alphabet', 0.016695184382550566),
  ('mrd', 0.016483994434767217),
  ('apple', 0.015297808245989907),
  ('dollar', 0.013144978482180228),
  ('bedrijf', 0.012853638145709667),
  ('miljard', 0.012706775282784168),
  ('google', 0.012513094328993096)],
 19: [('we', 0.007528589780243601),
  ('valk', 0.006629608536295808),
  ('wel', 0.006580696227294249),
  ('jaar', 0.00639973606913346),
  ('maria', 0.005802768208002155),
  ('familie', 0.005714953862863134),
  ('heel', 0.005592742431633042),
  ('goed', 0.005571436269298467),
  ('dhl', 0.005238346012580068),
  ('nederland', 0.005219273728395737)],
 20: [('asml', 0.05457544097786256),
  ('chip', 0.04910054466749747),
  ('kwartaal', 0.02645708286768284),
  ('omzet', 0.021359547783102166),
  ('wennink', 0.01955391258641443),
  ('mrd', 0.018885044108444018),
  ('tsmc', 0.017924419870879896),
  ('vraag', 0.016652493822619924),
  ('machine', 0.016089619160050674),
  ('bedrijf', 0.015661072714873974)],
 21: [('china', 0.03304394963518705),
  ('chinese', 0.032718086928642566),
  ('overheid', 0.013446449339427438),
  ('bedrijven', 0.011673661516661096),
  ('chinezen', 0.010992568995933394),
  ('peking', 0.010880668129019017),
  ('amerikaanse', 0.009484798464462202),
  ('tiktok', 0.009014391535767633),
  ('data', 0.0075168864984134325),
  ('huawei', 0.007500668189391401)],
 22: [('muziek', 0.04355922681086032),
  ('piano', 0.02030950012400095),
  ('universal', 0.018121508902470176),
  ('artiesten', 0.01722795505273336),
  ('tiktok', 0.015178493651215607),
  ('liedje', 0.014924765640932196),
  ('beethoven', 0.014907882481561883),
  ('nummer', 0.013391229174223515),
  ('spotify', 0.013345859805017984),
  ('stem', 0.012291892137664873)],
 23: [('ai', 0.02283307273986205),
  ('nederland', 0.016999920143917543),
  ('intelligentie', 0.01595990087832184),
  ('bedrijven', 0.014806636642981336),
  ('europa', 0.014608275850770588),
  ('investeren', 0.014108628091212442),
  ('kunstmatige', 0.013426913829129871),
  ('universiteiten', 0.012882362358324097),
  ('rijke', 0.012827008305885604),
  ('kabinet', 0.012544011242746777)],
 24: [('medium', 0.014184759293968617),
  ('nepnieuws', 0.013313360069305126),
  ('verkiezingen', 0.011506851519843255),
  ('desinformatie', 0.011257252523194518),
  ('empathie', 0.010856642187642066),
  ('trump', 0.009510688763654357),
  ('sociale', 0.008940136027120672),
  ('facebook', 0.008637944800341397),
  ('hamas', 0.008326415013536124),
  ('deepfakes', 0.0074517721712175225)],
 25: [('kunst', 0.018623475795978978),
  ('nachtwacht', 0.01834239840411389),
  ('museum', 0.01731988235630514),
  ('rembrandt', 0.017022868762467166),
  ('schilderij', 0.015137371325980446),
  ('werk', 0.014536727011108523),
  ('kunstenaar', 0.011448652448489173),
  ('rijksmuseum', 0.010718256950097073),
  ('kunstenaars', 0.010623974295294419),
  ('zien', 0.009411188466704908)],
 26: [('china', 0.05659312419989554),
  ('asml', 0.052236535494392854),
  ('chip', 0.050337745550440176),
  ('geavanceerde', 0.028479297854525448),
  ('chinese', 0.02811836239547994),
  ('amerikaanse', 0.024563009090252733),
  ('export', 0.01951203536144393),
  ('chipmachines', 0.016712161910090863),
  ('nanometer', 0.014910724276986299),
  ('machine', 0.014405894830052246)],
 27: [('we', 0.014831080602368343),
  ('mensheid', 0.014614574617482565),
  ('aarde', 0.010760385177413678),
  ('rees', 0.01075292985442079),
  ('toekomst', 0.009583059162370086),
  ('ai', 0.009296941103872434),
  ('men', 0.00878249942767476),
  ('risico', 0.008493905576458263),
  ('mensen', 0.007591491517162478),
  ('leven', 0.0074035398094496226)],
 28: [('europese', 0.031041140471569476),
  ('ai', 0.03045766751641336),
  ('regels', 0.029648282200635916),
  ('parlement', 0.02331293192866366),
  ('act', 0.02057936663468225),
  ('europees', 0.019629179011075287),
  ('wet', 0.01830853027051129),
  ('gezichtsherkenning', 0.017526351979474157),
  ('commissie', 0.016882362876275748),
  ('eu', 0.01660525919491919)],
 29: [('digitale', 0.026640486416918113),
  ('digitalisering', 0.02029556797538793),
  ('overheid', 0.012953276052304919),
  ('burger', 0.012782211629625426),
  ('we', 0.012507854398083513),
  ('data', 0.0121576476508313),
  ('onze', 0.010684634520230274),
  ('stad', 0.009904874409303838),
  ('technologie', 0.009205533091304114),
  ('politieke', 0.007465818252487621)],
 30: [('facebook', 0.07362798990987332),
  ('zuckerberg', 0.02227912090320167),
  ('gebruikers', 0.01566244146484264),
  ('berichten', 0.01295317382572982),
  ('platform', 0.00982646413459673),
  ('bedrijf', 0.009646861643307525),
  ('mensen', 0.00956030658121607),
  ('erik', 0.009161775236406034),
  ('nepnieuws', 0.009106100744170339),
  ('charon', 0.008714566640963605)],
 31: [('onderwijs', 0.03317593682511713),
  ('studenten', 0.02912873187124027),
  ('universiteiten', 0.02109638135362824),
  ('universiteit', 0.015728112754281848),
  ('school', 0.012269018684016423),
  ('scholen', 0.012241692800035838),
  ('studie', 0.01161492990082033),
  ('technische', 0.011109649894253684),
  ('hoger', 0.010379882463721173),
  ('student', 0.009858215311171882)],
 32: [('nvidia', 0.1382643492761839),
  ('chip', 0.039101525348793194),
  ('huang', 0.03263532195478313),
  ('intel', 0.020780675887409304),
  ('bedrijf', 0.020642361948870758),
  ('grafische', 0.019567704589832487),
  ('mrd', 0.017942506163389615),
  ('jensen', 0.017275434305214905),
  ('aandeel', 0.016494566138732316),
  ('omzet', 0.01625381773195205)],
 33: [('coronavirus', 0.01996576214599905),
  ('china', 0.019407004473973515),
  ('virus', 0.0162571321070852),
  ('chinese', 0.010789865432575589),
  ('ggd', 0.009271750289954266),
  ('corona', 0.009031967577803751),
  ('palantir', 0.00901515736706658),
  ('ademtest', 0.00816679573979107),
  ('pandemie', 0.007483301553037099),
  ('antisemitische', 0.007339871655480623)],
 34: [('foto', 0.039847876224386625),
  ('gezichtsherkenning', 0.03342935492184306),
  ('gezichten', 0.02105758048190932),
  ('clearview', 0.019312662972965522),
  ('camera', 0.01567420935545189),
  ('gezicht', 0.012890046842553203),
  ('technologie', 0.010578424245542136),
  ('software', 0.009425047368751407),
  ('politie', 0.008835769201594698),
  ('google', 0.00879840853934739)],
 35: [('winkel', 0.02540748955722635),
  ('zelfscankassa', 0.024162619436767608),
  ('klanten', 0.018897124443852734),
  ('winkeldiefstal', 0.01587445521649987),
  ('winkels', 0.01538491273235142),
  ('diefstal', 0.014799843325932415),
  ('supermarkten', 0.014770676446910077),
  ('supermarkt', 0.0139787956130453),
  ('jumbo', 0.01359793855836441),
  ('boodschappen', 0.013411632165475706)],
 36: [('acteurs', 0.06179829590791042),
  ('staking', 0.061241210964896646),
  ('schrijvers', 0.04659947836446929),
  ('hollywood', 0.042574317046335054),
  ('streamingdiensten', 0.031688083596126876),
  ('wga', 0.026833757233060706),
  ('studio', 0.02608735095031966),
  ('series', 0.024464478577851875),
  ('film', 0.02438477732222389),
  ('vakbond', 0.022641272413075442)],
 37: [('samsung', 0.07838884673408475),
  ('apple', 0.02072110477118594),
  ('smartphone', 0.017475769158254006),
  ('telefoons', 0.017185277019363454),
  ('huawei', 0.017084772328089956),
  ('telefoon', 0.015755056589734664),
  ('lee', 0.014156562685128419),
  ('iphone', 0.0138529218261527),
  ('galaxy', 0.012614670387571268),
  ('nieuwe', 0.01180484239346215)],
 38: [('apple', 0.11222821236149473),
  ('iphone', 0.033639690205776514),
  ('cook', 0.018244713504359366),
  ('kinderporno', 0.015675455763230234),
  ('google', 0.013944139074658583),
  ('auto', 0.011173352352370384),
  ('iphones', 0.010939457039879684),
  ('amazon', 0.010677318916326825),
  ('jaar', 0.010624317139389104),
  ('nieuwe', 0.010226730864612963)],
 39: [('journalistiek', 0.02437992171608028),
  ('journalisten', 0.020096238875964692),
  ('nieuws', 0.018634344266459307),
  ('artikelen', 0.014315531955046062),
  ('ai', 0.01364353057946019),
  ('medium', 0.010237571935797121),
  ('krant', 0.009800068360765229),
  ('mensen', 0.009452105588919643),
  ('informatie', 0.00938257374893693),
  ('channel', 0.008789438160282706)],
 40: [('algoritmes', 0.03186138509593045),
  ('data', 0.02905795876899499),
  ('algoritmen', 0.021501745176739733),
  ('register', 0.02014453477121616),
  ('big', 0.01704017172076839),
  ('burger', 0.016376090130409665),
  ('gemeente', 0.01495775382190589),
  ('algoritme', 0.013804508762642125),
  ('toezicht', 0.013534813679555967),
  ('overheid', 0.012134597172895677)],
 41: [('nederland', 0.02545478613080527),
  ('economie', 0.012440853139932283),
  ('bedrijven', 0.011577138467594662),
  ('landen', 0.010367445847695545),
  ('nederlandse', 0.009631545549985),
  ('schwarz', 0.009524851656561596),
  ('volberda', 0.009065504051425415),
  ('investeringen', 0.008771879299791655),
  ('we', 0.008743880119130885),
  ('wapenfeit', 0.008726124348079832)],
 42: [('deepfakes', 0.04069482968773314),
  ('video', 0.024093761523222547),
  ('deepfake', 0.023708647367245567),
  ('beelden', 0.015584472389228827),
  ('porno', 0.014621918150414394),
  ('foto', 0.012946894389676248),
  ('deepfakeporno', 0.012501959801129385),
  ('seksueel', 0.012079087112129806),
  ('vrouwen', 0.011616380999576391),
  ('slachtoffer', 0.011480126720331961)],
 43: [('bank', 0.048623689926658185),
  ('banken', 0.0481008927654361),
  ('ing', 0.02610760315206898),
  ('financiële', 0.021370235433630827),
  ('revolut', 0.019857046430125078),
  ('fintech', 0.01890675776403956),
  ('klanten', 0.016402667555210956),
  ('abn', 0.01632346049466571),
  ('sector', 0.015592290604082706),
  ('amro', 0.015054952281422935)],
 44: [('studenten', 0.03823428686360691),
  ('docenten', 0.023542144413973218),
  ('onderwijs', 0.022271720767519626),
  ('leerlingen', 0.02069375249449165),
  ('programmeren', 0.017713637782671213),
  ('leren', 0.014987330654177195),
  ('chatgpt', 0.014261773105292974),
  ('vaardigheden', 0.013975193533944181),
  ('opleidingen', 0.013898894138339992),
  ('scholen', 0.013745955221577035)],
 45: [('film', 0.018261687889153062),
  ('couture', 0.014296797144129373),
  ('herzog', 0.012963508351857129),
  ('marsigliese', 0.011700504737550367),
  ('dior', 0.011342545700525156),
  ('chanel', 0.010946621935943675),
  ('uur', 0.010346436447664793),
  ('dance', 0.00985195974234931),
  ('personage', 0.009312296236949976),
  ('show', 0.008737982146009193)],
 46: [('google', 0.07643432514953216),
  ('zoekmachine', 0.033056215564139214),
  ('alphabet', 0.019387924412855893),
  ('microsoft', 0.016338812566557264),
  ('gemini', 0.01583634798907),
  ('gebruikers', 0.01435887055982529),
  ('data', 0.013659258361361741),
  ('page', 0.013637143391797817),
  ('bedrijf', 0.013016339938508734),
  ('bing', 0.012218660067047452)],
 47: [('dnb', 0.09049556653438254),
  ('bunq', 0.0773111139510025),
  ('banken', 0.06678080768852333),
  ('witwassen', 0.03906454773714225),
  ('bank', 0.03765731808882712),
  ('toezichthouder', 0.03576303565089094),
  ('transacties', 0.028263292715932826),
  ('klanten', 0.024213681221675585),
  ('niknam', 0.021616513219079064),
  ('financiële', 0.02098139723558336)],
 48: [('oekraïne', 0.03745135366486887),
  ('drone', 0.029988022131922327),
  ('oorlog', 0.02263246953352628),
  ('russische', 0.022308243942435965),
  ('oekraïense', 0.018523852949238345),
  ('militaire', 0.014836120841785599),
  ('rusland', 0.014142339813023532),
  ('thales', 0.0136247653725483),
  ('wapens', 0.01257684854528385),
  ('russen', 0.01103601481349522)],
 49: [('privacy', 0.0323302433463922),
  ('avg', 0.019931577463772065),
  ('data', 0.019389952376528366),
  ('europese', 0.013993969536490932),
  ('gegevens', 0.012451953172739678),
  ('facebook', 0.012186627732606283),
  ('google', 0.01095620717871909),
  ('microklussers', 0.010938383526115498),
  ('advertenties', 0.01059946012757427),
  ('microklussen', 0.010342799095003359)],
 50: [('vrouwen', 0.023235538471334766),
  ('mannen', 0.01812429203182792),
  ('seks', 0.01138792491076651),
  ('homo', 0.010774500149032848),
  ('vrouw', 0.009970067569380786),
  ('kosinski', 0.009262922035193216),
  ('epstein', 0.008992170771015374),
  ('seksuele', 0.007938273239968171),
  ('we', 0.007848067121773702),
  ('man', 0.007541549401764835)],
 51: [('arbeidsmarkt', 0.0240093875688311),
  ('arbeidsproductiviteit', 0.0226865825153014),
  ('werknemers', 0.0142446414947354),
  ('werkgevers', 0.013746245749089841),
  ('productiviteit', 0.013568126038422916),
  ('economie', 0.013017484557473223),
  ('werken', 0.012720098482658183),
  ('productiviteitsgroei', 0.012697634902260226),
  ('nederland', 0.012001366274995382),
  ('sectoren', 0.0118802600855739)],
 52: [('drone', 0.08841573992697767),
  ('ballon', 0.019045797278174313),
  ('vliegen', 0.01842712560128632),
  ('lucht', 0.018070006791596593),
  ('delft', 0.01400321999182485),
  ('riemens', 0.013409627783314447),
  ('ballonnen', 0.009389396196219282),
  ('middendorp', 0.009228414862249177),
  ('zegt', 0.009153844673451527),
  ('oostrum', 0.009111559560882151)],
 53: [('virtuele', 0.030923558532187347),
  ('avatar', 0.030837969665672888),
  ('metaverse', 0.023516051794602096),
  ('virtual', 0.018480223260516418),
  ('reality', 0.018270828329401263),
  ('vr', 0.01413782954533946),
  ('digitale', 0.012786368879852576),
  ('metaversum', 0.011634082573884577),
  ('bril', 0.011332944348925194),
  ('virtueel', 0.011226196244377415)],
 54: [('taal', 0.027963328622367185),
  ('translate', 0.027015390166831306),
  ('vertalers', 0.02343929571709614),
  ('vertalingen', 0.022155758866782886),
  ('talen', 0.020411989250542516),
  ('vertalen', 0.019297901992057564),
  ('gebarentaal', 0.017308988349071786),
  ('vertaling', 0.016832348458685337),
  ('google', 0.016748560934286873),
  ('arabisch', 0.01585642698951597)],
 55: [('wapens', 0.0781336319779496),
  ('autonome', 0.05072535314780074),
  ('robot', 0.028747503311168317),
  ('killer', 0.025482076120396695),
  ('verbod', 0.018469299357596825),
  ('drone', 0.015600537299756457),
  ('autonoom', 0.014290267828753228),
  ('pax', 0.013871531406333312),
  ('wapensystemen', 0.013510335453636288),
  ('brief', 0.013507795228199558)],
 56: [('spelers', 0.026279769356587154),
  ('club', 0.024277135109256463),
  ('bal', 0.02204047737100079),
  ('voetbal', 0.020550381467079316),
  ('scisports', 0.018078381142403635),
  ('sport', 0.01754475511944718),
  ('wedstrijden', 0.015215056840477597),
  ('ajax', 0.01354937635238009),
  ('brouwer', 0.013526641347020819),
  ('wedstrijd', 0.013519471468271333)],
 57: [('algoritme', 0.04389544885480664),
  ('algoritmes', 0.04153184732150761),
  ('algoritmen', 0.03678659743190971),
  ('beslissingen', 0.019003189445809726),
  ('fry', 0.01570488929149069),
  ('mensen', 0.013292812246911414),
  ('algoritmische', 0.012152977993770929),
  ('we', 0.011965433829654338),
  ('menselijke', 0.010417969542037204),
  ('fouten', 0.009323198409820235)],
 58: [('klm', 0.04325929006531934),
  ('vliegtuig', 0.030714700310642634),
  ('vliegtuigen', 0.026888690808260667),
  ('schiphol', 0.02250176884540963),
  ('passagiers', 0.0186936508232281),
  ('rintel', 0.018078236084303693),
  ('vliegen', 0.017437474235092724),
  ('piloot', 0.01655168472801233),
  ('luchthaven', 0.01482464204808255),
  ('toestellen', 0.012897553770407291)],
 59: [('stem', 0.03099679375851028),
  ('tinnitus', 0.029597134232338985),
  ('geluid', 0.018022601058312163),
  ('ridder', 0.014544277903284615),
  ('whispp', 0.013006014233020669),
  ('via', 0.012669533327183614),
  ('starkey', 0.01243584641985006),
  ('castermans', 0.011758726578725362),
  ('serdijn', 0.010383303466736547),
  ('sawalich', 0.010383303466736547)],
 60: [('hacker', 0.027636735308037255),
  ('moerel', 0.020097568469805694),
  ('bedrijven', 0.012031390771884577),
  ('cybersecurity', 0.012010053289878135),
  ('criminelen', 0.01113611810453505),
  ('website', 0.0101717402378479),
  ('cybersprint', 0.009740114424781496),
  ('phishing', 0.009196243438149872),
  ('ransomware', 0.008835109119620039),
  ('internet', 0.008801859853171294)],
 61: [('altman', 0.14102119698967272),
  ('openai', 0.1015710325300196),
  ('bestuur', 0.05747783384413201),
  ('sam', 0.038525689131678136),
  ('ontslag', 0.03568949222590706),
  ('topman', 0.03261254722196201),
  ('brockman', 0.03193199604837349),
  ('soros', 0.0277330150238057),
  ('microsoft', 0.025840846128625673),
  ('sutskever', 0.022742284100814896)],
 62: [('wapens', 0.037579541031739806),
  ('autonome', 0.03184030739144296),
  ('militaire', 0.02264954938527403),
  ('drone', 0.021960913663308154),
  ('defensie', 0.016309512250959383),
  ('wapensystemen', 0.015821213182588167),
  ('ai', 0.014663109982916516),
  ('militairen', 0.01329124972569004),
  ('systemen', 0.011888935495414376),
  ('verdrag', 0.010969135966135834)],
 63: [('cloud', 0.07169687990321258),
  ('microsoft', 0.03731169961703853),
  ('amazon', 0.024516316933465624),
  ('oracle', 0.024012023188955206),
  ('google', 0.02338510395534385),
  ('bedrijven', 0.01962741711526994),
  ('data', 0.017917529013155954),
  ('clouddiensten', 0.017832008943814242),
  ('azure', 0.01588130498704494),
  ('mrd', 0.01418703034539035)],
 64: [('chainalysis', 0.031963491878098646),
  ('cryptomunten', 0.027006183239194598),
  ('bitcoin', 0.02068948583400805),
  ('coinbase', 0.016489240726289874),
  ('worldcoin', 0.015082309304071942),
  ('bitcoins', 0.01497978318632785),
  ('gronager', 0.013531356710740879),
  ('criminelen', 0.013153100927644957),
  ('cryptomunt', 0.01287562658619239),
  ('cryptovaluta', 0.01170242524877489)],
 65: [('datacenters', 0.09250453908788336),
  ('datacentra', 0.024567228169454766),
  ('datacenter', 0.02375277534878531),
  ('stroom', 0.023305579912576445),
  ('dda', 0.022812290526099153),
  ('nederland', 0.02236831597883887),
  ('grove', 0.01835216162280343),
  ('apparatuur', 0.01699588264289452),
  ('amsterdam', 0.014118266961348285),
  ('huawei', 0.012833675795747622)],
 66: [('ai', 0.024373660359767502),
  ('sheikh', 0.022911151137812825),
  ('zelfrijdende', 0.019768704283732595),
  ('intelligentie', 0.0193889254594412),
  ('kunstmatige', 0.018835880289904888),
  ('wrr', 0.01845407464686386),
  ('afm', 0.017954740887184673),
  ('auto', 0.0173475210179342),
  ('dobbelaere', 0.01612973953465127),
  ('rapport', 0.015302639539079448)],
 67: [('advocaten', 0.047869690048671906),
  ('juridische', 0.045160477643869414),
  ('legal', 0.02942688694104348),
  ('advocaat', 0.02693618278359918),
  ('kantoren', 0.024629087472820704),
  ('advocatuur', 0.022606747533347554),
  ('advocatenkantoren', 0.02123340688089756),
  ('legaltech', 0.018696998886393815),
  ('juristen', 0.015982974519594486),
  ('overy', 0.015654107330778065)],
 68: [('kandidaten', 0.04367373625548949),
  ('recruiter', 0.04059988443337879),
  ('cv', 0.02841701933068386),
  ('kandidaat', 0.023956661099390424),
  ('sollicitanten', 0.020445619350926883),
  ('oostrom', 0.019436682990122387),
  ('algoritme', 0.01736537713714318),
  ('akhlal', 0.016339307318545562),
  ('unilever', 0.013573547331955952),
  ('sollicitant', 0.012513266790041628)],
 69: [('ai', 0.04440479196872834),
  ('risico', 0.027107681214019953),
  ('sunak', 0.022799892252258223),
  ('bletchley', 0.019591834607778892),
  ('britse', 0.015207195899922171),
  ('wetgeving', 0.014117647977543557),
  ('technologie', 0.012597777224588965),
  ('top', 0.011148902166749783),
  ('we', 0.010859002736126798),
  ('park', 0.010810450500599724)],
 70: [('russische', 0.08139310949383995),
  ('volozj', 0.07013517038729875),
  ('yandex', 0.0516088989642387),
  ('aivd', 0.03746322566718366),
  ('spionage', 0.03694751963616592),
  ('diplomaten', 0.033952971870485016),
  ('russen', 0.02939391204516434),
  ('rusland', 0.028250598128174105),
  ('sanctielijst', 0.02713202981735881),
  ('ambassade', 0.026961131735766673)],
 71: [('musk', 0.06650748951511597),
  ('tesla', 0.030365478445581934),
  ('spacex', 0.028314404380681616),
  ('elon', 0.026090808052061758),
  ('aarde', 0.020386576924022716),
  ('mar', 0.018498118552802065),
  ('raket', 0.01755896371306701),
  ('isaacson', 0.01441471552480549),
  ('cooijmans', 0.014198992401643771),
  ('ruimtevaart', 0.013119357697234069)],
 72: [('politie', 0.04781282485078935),
  ('plas', 0.03362713788792385),
  ('criminelen', 0.01928401973533707),
  ('gegevens', 0.017024873007959004),
  ('criminaliteit', 0.016655551915115818),
  ('cybercrime', 0.016388437673752608),
  ('aangifte', 0.01623265590662635),
  ('wolfert', 0.014087950454538587),
  ('aangiftes', 0.01339569906638745),
  ('cold', 0.012624545326596286)],
 73: [('chinese', 0.0430256874650782),
  ('china', 0.03808194540686919),
  ('universiteiten', 0.03361741992495753),
  ('campus', 0.030269088053335946),
  ('boekhoorn', 0.025857127204376187),
  ('samenwerking', 0.024108856450804114),
  ('jorritsma', 0.02396477886377274),
  ('studenten', 0.02133448062699776),
  ('nederlandse', 0.02090022547250005),
  ('onderzoek', 0.017712554526376992)],
 74: [('loznitsa', 0.019922144019883646),
  ('hoofddoek', 0.014497285714296558),
  ('facebook', 0.013162652329102157),
  ('ressa', 0.012158517472742275),
  ('regime', 0.012046000234721532),
  ('vrouwen', 0.011851095935147798),
  ('hamas', 0.011421999480738727),
  ('cvz', 0.01092148489653135),
  ('russische', 0.01082440582285944),
  ('klette', 0.01067176084111041)],
 75: [('poolwervel', 0.03702436451447165),
  ('muon', 0.03207256922877086),
  ('satellieten', 0.03150133731368994),
  ('aarde', 0.02798953606905963),
  ('meteorieten', 0.02547428486986456),
  ('satelliet', 0.022027462301644433),
  ('methaan', 0.022008983824331148),
  ('noordpool', 0.021245578681822707),
  ('sark', 0.018425023697900594),
  ('hyperscout', 0.01720848573053478)],
 76: [('smartphone', 0.050619241523059626),
  ('lens', 0.034953763007363144),
  ('telefoon', 0.02439119070269335),
  ('wouter', 0.021235008917102158),
  ('mobieltjes', 0.020990203733791027),
  ('google', 0.018999667553875284),
  ('pixel', 0.016500596008511864),
  ('leerlingen', 0.01629585114799356),
  ('iphone', 0.016234694789581377),
  ('verbod', 0.015075568259468791)],
 77: [('hawking', 0.054165272806966425),
  ('andringa', 0.03381514166296401),
  ('wilczek', 0.019609269884495677),
  ('natuurkunde', 0.017666774996546227),
  ('boek', 0.012931733538506315),
  ('stephen', 0.012856808560909668),
  ('wetenschap', 0.012085276855272102),
  ('heelal', 0.01019552655545034),
  ('generalisten', 0.009757623348562427),
  ('zwarte', 0.009225484331909157)],
 78: [('biesheuvel', 0.04002041556629835),
  ('ema', 0.03646489363433604),
  ('woerdt', 0.01755586145323328),
  ('nederland', 0.01603492718013505),
  ('wehkamp', 0.014857323518077071),
  ('ramakers', 0.013283343343572623),
  ('gamecongres', 0.011812053215886353),
  ('unilever', 0.011720414926673736),
  ('ondernemers', 0.01168674012791996),
  ('indigo', 0.011456108602901477)],
 79: [('chip', 0.042423350031513096),
  ('asml', 0.03529431417119637),
  ('intel', 0.029005074453676803),
  ('europa', 0.028832796644165416),
  ('europese', 0.021069444553720083),
  ('fabriek', 0.0203544216823179),
  ('nauta', 0.017472708701010028),
  ('wennink', 0.01741550066316771),
  ('asmi', 0.016370730206957215),
  ('miljard', 0.014511816497356883)],
 80: [('krant', 0.03526574497927175),
  ('dpg', 0.027727130991532525),
  ('kranten', 0.02610479077986151),
  ('nieuws', 0.015333359359573237),
  ('lezers', 0.014773755648927208),
  ('thillo', 0.014597649796703811),
  ('bild', 0.01398253932227877),
  ('guardian', 0.013089257924132314),
  ('oktober', 0.012519500697486082),
  ('medium', 0.01223343327041974)],
 81: [('bussche', 0.04526674649416625),
  ('podcast', 0.045228788729228614),
  ('podcasts', 0.034623072743861866),
  ('rogan', 0.027704154029604782),
  ('den', 0.025011571402178096),
  ('spotify', 0.022400191829297977),
  ('kwebbelkop', 0.015736803370334486),
  ('jordi', 0.015337340265690568),
  ('youtube', 0.013017907716950761),
  ('strengholt', 0.012754523076080819)],
 82: [('huawei', 0.0983974968933943),
  ('universiteiten', 0.04623969139965946),
  ('chinese', 0.038498490269710854),
  ('samenwerking', 0.03437739583316361),
  ('harmelen', 0.026495051279462948),
  ('instituten', 0.021956172817828454),
  ('china', 0.021786305940049717),
  ('confucius', 0.017183473976952867),
  ('wetenschappers', 0.01712817604018451),
  ('vu', 0.0163415435985534)],
 83: [('amazon', 0.09375427812943085),
  ('bezos', 0.03355596884906663),
  ('platform', 0.02388895718633598),
  ('jeff', 0.013371015778371619),
  ('anthropic', 0.012880667475770755),
  ('duitsland', 0.012483738294385432),
  ('werknemers', 0.012084724462030287),
  ('vogels', 0.011922113873336462),
  ('alexa', 0.010937747321506725),
  ('choudary', 0.010830737007990594)],
 84: [('bamps', 0.025203427950659676),
  ('data', 0.01610127780589552),
  ('source', 0.013595740898488343),
  ('onze', 0.012387276927014952),
  ('roman', 0.011638330436690856),
  ('koenig', 0.01116036588330626),
  ('we', 0.010982185457777741),
  ('meta', 0.010439373130131324),
  ('open', 0.010409098878651551),
  ('russell', 0.010360852575021541)],
 85: [('twitter', 0.08520151960524652),
  ('musk', 0.05443472880353841),
  ('nepaccounts', 0.031888808778552075),
  ('tweet', 0.027872284047031078),
  ('mastodon', 0.027746572035183388),
  ('account', 0.018344430302111348),
  ('socialemediabedrijven', 0.016696621443996362),
  ('elon', 0.01600870568006368),
  ('medium', 0.01542855180234373),
  ('gebruikers', 0.01515647363228872)],
 86: [('god', 0.024611223189918593),
  ('ai', 0.014788128259455668),
  ('blok', 0.014619879911181298),
  ('broersen', 0.014168532376730833),
  ('harari', 0.013308997315482353),
  ('men', 0.013267570219735491),
  ('we', 0.011783620641204228),
  ('bostrom', 0.0110896133746554),
  ('goddelijke', 0.011030459931300423),
  ('mensen', 0.010863308843100734)],
 87: [('youtube', 0.08270607943481365),
  ('video', 0.034412667392502365),
  ('filmpjes', 0.0279971171361612),
  ('facebook', 0.0216824135093742),
  ('tang', 0.02167909936999572),
  ('hoven', 0.017641962554577915),
  ('inhoud', 0.01702017339353581),
  ('kinderen', 0.01615454961349382),
  ('moderator', 0.014701635462148263),
  ('platformen', 0.0138363228933664)],
 88: [('online', 0.0224510221865735),
  ('unless', 0.01862990139978048),
  ('concept', 0.01585963649283838),
  ('klanten', 0.013750396304005747),
  ('prevoo', 0.013623565578826909),
  ('wappzapp', 0.013623565578826909),
  ('klant', 0.013433849455625677),
  ('nagtegaal', 0.013405432896362812),
  ('ezrachi', 0.012853633983237679),
  ('veelbelovend', 0.012774977006155644)],
 89: [('aardbevingen', 0.03138213977702712),
  ('hooper', 0.026675417540499822),
  ('aardbeving', 0.025881833909104462),
  ('vulkanen', 0.017921012137610437),
  ('johnson', 0.017482656771525853),
  ('voorspellen', 0.016458889294327133),
  ('extreem', 0.016453868566745643),
  ('vulkaan', 0.01481967641138879),
  ('model', 0.014703219559803919),
  ('aardplaten', 0.014587569609209568)],
 90: [('universiteiten', 0.03767973971984582),
  ('spionage', 0.03253955844351508),
  ('chinese', 0.02717989517608118),
  ('hikvision', 0.026863879311577233),
  ('loket', 0.02597386549937508),
  ('diercks', 0.023562113419335676),
  ('nctv', 0.02130248046495377),
  ('kennis', 0.020581128409258003),
  ('mivd', 0.020456442475896754),
  ('samenwerkingen', 0.01906822291910119)]}
In [16]:
model.visualize_barchart(top_n_topics=90)
In [21]:
topic_details_df
Out[21]:
Topic Count Name Representation Representative_Docs
0 -1 2498 we_jaar_mensen_wel [we, jaar, mensen, wel, intelligentie, zegt, k... [kunstmatige intelligentie best bedreigend kun...
1 0 432 robot_men_mensen_we [robot, men, mensen, we, machine, intelligenti... [tekenfilm maken gaat ai stuk sneller animatie...
2 1 147 patiënten_patiënt_zorg_artsen [patiënten, patiënt, zorg, artsen, medische, p... [kunstmatige intelligentie verslaat artsen ops...
3 2 136 auto_rijden_zelfrijdende_tesla [auto, rijden, zelfrijdende, tesla, elektrisch... [robot betere kunstmatige intelligentie zelfri...
4 3 123 beleggers_rente_inflatie_aex [beleggers, rente, inflatie, aex, aandelen, am... [oorlogen trage economische groei laten belegg...
... ... ... ... ... ...
87 86 12 god_ai_blok_broersen [god, ai, blok, broersen, harari, men, we, bos... [bedreigt kunstmatige intelligentie godsbeeld ...
88 87 12 youtube_video_filmpjes_facebook [youtube, video, filmpjes, facebook, tang, hov... [youtube grijpt mountain view youtube afgelope...
89 88 11 online_unless_concept_klanten [online, unless, concept, klanten, prevoo, wap... [online gaat winkelstraat nooit dicht verslave...
90 89 10 aardbevingen_hooper_aardbeving_vulkanen [aardbevingen, hooper, aardbeving, vulkanen, j... [lezersreacties tof krant nieuws mei aandacht ...
91 90 10 universiteiten_spionage_chinese_hikvision [universiteiten, spionage, chinese, hikvision,... [laat nederland speelbal minister generaals pr...

92 rows × 5 columns

Based on analyzing the topics and content, the following topics were manually merged:

  • -1 AI in society: -1
  • 0 Health: 1, 12, 19, 33, 50, 54, 56, 59
  • 1 Environment: 8, 58, 75, 78, 89
  • 2 Art: 7, 13, 22, 25, 36, 45,
  • 3 Media: 14, 39, 85, 88, 80 ,81, 87
  • 4 Economy: 3, 6, 15, 16, 43, 47, 51, 64, 68
  • 5 Law: 24, 34, 35, 42, 52, 55, 60, 62, 67, 72
  • 6 Politics: 5, 10, 11, 21, 28, 40, 41, 48, 49, 66, 69, 70, 74, 79, 23
  • 7 Education: 31, 44, 73, 76, 77, 82, 86, 90
  • 8 Technology: 0, 2, 9, 27, 29, 53, 57, 65
  • 9 Business: 4, 17, 18, 20, 26, 30, 32, 37, 38, 46, 61, 63, 71, 83
In [17]:
# Merge the specified topics
topics_to_merge = [[1, 12, 19, 33, 50, 54, 56, 59],
                   [8, 58, 75, 78, 89],
                   [7, 13, 22, 25, 36, 45],
                   [14, 39, 85, 88, 80 ,81, 87],
                   [3, 6, 15, 16, 43, 47, 51, 64, 68],
                   [24, 34, 35, 42, 52, 55, 60, 62, 67, 72],
                   [5, 10, 11, 21, 28, 40, 41, 48, 49, 66, 69, 70, 74, 79, 23],
                   [31, 44, 73, 76, 77, 82, 86, 90],
                   [0, 2, 9, 27, 29, 53, 57, 65, 84],
                   [4, 17, 18, 20, 26, 30, 32, 37, 38, 46, 61, 63, 71, 83]]
model.merge_topics(content_titles, topics_to_merge)
In [34]:
# Get the topics and their representations
topics_info = model.get_topic_info()

# Save the topics information DataFrame to an Excel file
topics_info.to_excel('/Users/helgegeurtjacobusmoes/Desktop/thesis data/Merged_Topic_Info.xlsx', index=False)

# Save the topics information DataFrame to a CSV file
topics_info.to_csv('/Users/helgegeurtjacobusmoes/Desktop/thesis data/Merged_Topic_Info.csv', index=False)

Topics arranged:

  • -1 AI in society: -1
  • 0 Technology: 2, 9, 27, 29, 53, 57, 65
  • 1 Politics: 5, 10, 11, 21, 28, 40, 41, 48, 49, 66, 69, 70, 74, 79, 23
  • 2 Business: 4, 17, 18, 20, 26, 30, 32, 37, 38, 46, 61, 63, 71, 83
  • 3 Economy: 3, 6, 15, 16, 43, 47, 51, 64, 68
  • 4 Health: 1, 12, 19, 33, 50, 54, 56, 59
  • 5 Art: 13, 22, 25, 36, 45, 81, 87
  • 6 Law: 24, 34, 35, 42, 52, 55, 60, 62, 67, 72
  • 7 Media: 14, 39, 85, 88, 80, ,81, 87
  • 8 Education: 31, 44, 73, 76, 77, 82, 86, 90
  • 9 Environment: 8, 58, 75, 78, 89
In [23]:
model.get_topics()
Out[23]:
{-1: [('we', 0.022246328484310914),
  ('jaar', 0.014825560535742755),
  ('mensen', 0.014233405312292684),
  ('wel', 0.013011375234743543),
  ('intelligentie', 0.01268669676667743),
  ('kunstmatige', 0.012182137624903732),
  ('zegt', 0.012007260891035156),
  ('nieuwe', 0.011391891814129043),
  ('gaat', 0.010897747908243225),
  ('moeten', 0.01054144716387557)],
 0: [('robot', 0.040747440280944944),
  ('we', 0.027470876669336355),
  ('mensen', 0.019451643909996912),
  ('auto', 0.019356006306047474),
  ('intelligentie', 0.018192291740770892),
  ('kunstmatige', 0.016748266185515403),
  ('men', 0.014375584320160421),
  ('wel', 0.013622172170076222),
  ('jaar', 0.01286155054987591),
  ('nieuwe', 0.012393326194674798)],
 1: [('china', 0.028164840461059135),
  ('europese', 0.022046840430905077),
  ('we', 0.019394168050648072),
  ('europa', 0.017627095347032613),
  ('chinese', 0.014903966734085599),
  ('moeten', 0.014246470556257248),
  ('nederland', 0.014052009316309339),
  ('bedrijven', 0.014033153058774818),
  ('jaar', 0.012712776676954494),
  ('ai', 0.012300241160117346)],
 2: [('bedrijf', 0.020442955453832214),
  ('microsoft', 0.019825020816631674),
  ('google', 0.019515951347540167),
  ('chatgpt', 0.018515860464228668),
  ('chip', 0.01836989055946684),
  ('jaar', 0.01697677405734086),
  ('facebook', 0.01685282738055883),
  ('apple', 0.0160507727582351),
  ('asml', 0.015606122255538464),
  ('intelligentie', 0.014374516867028383)],
 3: [('jaar', 0.023184081686227948),
  ('banken', 0.01761774364528032),
  ('bedrijven', 0.016916932469397013),
  ('bank', 0.01666501615891372),
  ('geld', 0.015093450809297216),
  ('beleggers', 0.014860654563712003),
  ('bedrijf', 0.014705946804651981),
  ('we', 0.01390273161684443),
  ('nieuwe', 0.013110427064134428),
  ('gaat', 0.012000621949861656)],
 4: [('we', 0.021930547602218842),
  ('zegt', 0.014452921571927594),
  ('mensen', 0.014238437413663402),
  ('patiënten', 0.01378244587698588),
  ('wel', 0.013054381290297448),
  ('jaar', 0.012210726807541672),
  ('kunstmatige', 0.01197661314466977),
  ('intelligentie', 0.011900529992663087),
  ('patiënt', 0.011619452284720125),
  ('gaat', 0.010728236808870091)],
 5: [('film', 0.01847755447647912),
  ('the', 0.016350241210528083),
  ('we', 0.01543026679492675),
  ('muziek', 0.012888694380478011),
  ('wel', 0.012334602999807629),
  ('intelligentie', 0.012275216231019716),
  ('kunstmatige', 0.011649908596628874),
  ('jaar', 0.011306595788701196),
  ('werk', 0.011084767144725113),
  ('wereld', 0.010643657791755752)],
 6: [('we', 0.019330371749468046),
  ('foto', 0.01711297029532976),
  ('wapens', 0.016277234436833318),
  ('drone', 0.015328767286442876),
  ('mensen', 0.01504924850381791),
  ('zegt', 0.014326810144608688),
  ('kunstmatige', 0.013447552842361231),
  ('intelligentie', 0.013419206847327513),
  ('maken', 0.013239187128046752),
  ('wel', 0.01293068359901015)],
 7: [('spel', 0.018312580198690025),
  ('we', 0.017890999492087493),
  ('computer', 0.017318754694953524),
  ('game', 0.016736522741384936),
  ('mensen', 0.01566625183046286),
  ('schaken', 0.01417092688063822),
  ('intelligentie', 0.013726519506834602),
  ('zegt', 0.01339559450882493),
  ('jaar', 0.01325947556191445),
  ('kunstmatige', 0.013200768323543527)],
 8: [('studenten', 0.029645865440638606),
  ('universiteiten', 0.02785055605586397),
  ('onderwijs', 0.026812243473693962),
  ('we', 0.020442958389444876),
  ('chinese', 0.017319446072270553),
  ('universiteit', 0.01653292988117574),
  ('zegt', 0.01483222235448319),
  ('china', 0.014413560905648978),
  ('huawei', 0.014151140263779867),
  ('wel', 0.014024755186748915)],
 9: [('we', 0.028868686909911203),
  ('dieren', 0.027081379669085564),
  ('jaar', 0.014316680112669084),
  ('zegt', 0.013786552606789631),
  ('waar', 0.012236350448078267),
  ('gaat', 0.012000136818076594),
  ('wel', 0.011432472326371021),
  ('zien', 0.011080247028019128),
  ('natuur', 0.010665189256561906),
  ('data', 0.010562430487446218)]}
In [24]:
#Associates the assigned topics and their probabilities with each news article in the DataFrame
data['Topic'] = topics
data['Probabilities'] = probabilities
In [25]:
topics = data['Topic'].unique()
print(topics)
[ 9 69 -1 86  8 28  0 24  1 25  4 61  2 21 46 74 22 75 35 53 12 13 29 39
 89 56 54 44 36 32 50 42 45 11 10  7 59 18 23  5 57 31 66 68 20 62 64 16
 84 15 37  3 88 51 85 67 90 33 30 48 65 27 72 40 41 55  6 60 26 70 14 38
 19 49 34 58 76 71 83 80 81 47 77 52 79 17 63 73 43 78 87 82]
  • -1 AI in society: -1
  • 0 Technology: 0, 2, 9, 27, 29, 53, 57, 65, 84
  • 1 Politics: 5, 10, 11, 21, 28, 40, 41, 48, 49, 66, 69, 70, 74, 79, 23
  • 2 Business: 4, 17, 18, 20, 26, 30, 32, 37, 38, 46, 61, 63, 71, 83
  • 3 Economy: 3, 6, 15, 16, 43, 47, 51, 64, 68
  • 4 Health: 1, 12, 19, 33, 50, 54, 56, 59
  • 5 Art: 13, 22, 25, 36, 45, 81, 87
  • 6 Law: 24, 34, 35, 42, 52, 55, 60, 62, 67, 72
  • 7 Media: 14, 39, 85, 88, 80, 81, 87
  • 8 Education: 31, 44, 73, 76, 77, 82, 86, 90
  • 9 Environment: 8, 58, 75, 78, 89

-1 = AI in society: -1 0 = 1: 0, 2: 0, 9: 0, 27: 0, 29: 0, 53: 0, 57: 0, 65: 0, 84: 0, 1 = 5: 1, 10: 1, 11: 1, 21: 1, 28: 1, 40: 1, 41: 1, 48: 1, 49: 1, 66: 1, 69: 1, 70: 1, 74: 1, 79: 1, 23: 1 2 = 4: 2, 17: 2, 18: 2, 20: 2, 26: 2, 30: 2, 32: 2, 37: 2, 38: 2, 46: 2, 61: 2, 63: 2, 71: 2, 83:2 3 = 3: 3, 6: 3, 15: 3, 16: 3, 43: 3, 47: 3, 51: 3, 64: 3, 68: 3 4 = 1: 4, 12: 4, 19: 4, 33: 4, 50: 4, 54: 4, 56: 4, 59: 4 5 = 13: 5, 22: 5, 25: 5, 36: 5, 45: 5, 81: 5, 87: 5 6 = 24: 6, 34: 6, 35: 6, 42: 6, 52: 6, 55: 6, 60: 6, 62: 6, 67: 6, 72: 6 7 = 14: 7, 39: 7, 85: 7, 88: 7, 80: 7, 81: 7, 87: 7 8 = 31: 8, 44: 8, 73: 8, 76: 8, 77: 8, 82: 8, 86: 8, 90: 8 9 = 8: 9, 58: 9, 75: 9, 78: 9, 89: 9

In [26]:
data_updated = data.copy()
data_updated['Topic'] = data_updated['Topic'].replace({
    1: 0, 2: 0, 9: 0, 27: 0, 29: 0, 53: 0, 57: 0, 65: 0, 84: 0,
    5: 1, 10: 1, 11: 1, 21: 1, 28: 1, 40: 1, 41: 1, 48: 1, 49: 1, 66: 1, 69: 1, 70: 1, 74: 1, 79: 1, 23: 1, 
    4: 2, 17: 2, 18: 2, 20: 2, 26: 2, 30: 2, 32: 2, 37: 2, 38: 2, 46: 2, 61: 2, 63: 2, 71: 2, 83:2,
    3: 3, 6: 3, 15: 3, 16: 3, 43: 3, 47: 3, 51: 3, 64: 3, 68: 3, 
    1: 4, 12: 4, 19: 4, 33: 4, 50: 4, 54: 4, 56: 4, 59: 4,
    13: 5, 22: 5, 25: 5, 36: 5, 45: 5, 81: 5, 87: 5,
    24: 6, 34: 6, 35: 6, 42: 6, 52: 6, 55: 6, 60: 6, 62: 6, 67: 6, 72: 6,
    14: 7, 39: 7, 85: 7, 88: 7, 80: 7, 81: 7, 87: 7,
    31: 8, 44: 8, 73: 8, 76: 8, 77: 8, 82: 8, 86: 8, 90: 8,
    8: 9, 58: 9, 75: 9, 78: 9, 89: 9
})
data_updated
Out[26]:
Headline Publication URL News Outlet Type of News Word Count Body Publication Date Combined Topic Probabilities
0 Nee, kunstmatige intelligentie gaat ons niet u... Trouw, Verdieping; Blz. 4, 5, 2044 words https://advance.lexis.com/api/document?collect... Trouw Verdieping 2044 Welkom in de AI-fabriek serie\nDat kunstmatige... 07-12-2023 nee kunstmatige intelligentie gaat uitroeien w... 0 0.716202
1 Wereldleiders zoeken grip op kunstmatige intel... Trouw, Vandaag; Blz. 6, 528 words https://advance.lexis.com/api/document?collect... Trouw Vandaag 528 Op het Britse landgoed Bletchley Park werden t... 03-11-2023 wereldleiders zoeken grip kunstmatige intellig... 1 1.000000
2 Kunstmatige intelligentie is best bedreigend Trouw, Tijdgeest; Blz. 8, 576 words https://advance.lexis.com/api/document?collect... Trouw Tijdgeest 576 Of kunstmatige intelligentie nuttig is (Tijdge... 13-05-2023 kunstmatige intelligentie best bedreigend kuns... -1 0.000000
3 Mensen zijn een stuk efficiënter dan kunstmati... Trouw, Vandaag; Blz. 3, 741 words https://advance.lexis.com/api/document?collect... Trouw Vandaag 741 De wereld raakte het afgelopen jaar in de ban ... 21-10-2023 mensen stuk efficiënter kunstmatige intelligen... -1 0.000000
4 Bedreigt kunstmatige intelligentie ons godsbeeld? Trouw, Religie en Filosofie; Blz. 8, 9, 1367 w... https://advance.lexis.com/api/document?collect... Trouw Religie en Filosofie 1367 Theologisch elftal\n'In het begin was het Woor... 16-12-2022 bedreigt kunstmatige intelligentie godsbeeld t... 8 1.000000
... ... ... ... ... ... ... ... ... ... ... ...
6441 De rauwe realiteit Het Financieele Dagblad, MORGEN; Blz. 4, 2920 ... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad MORGEN 2920 Grootse oplossingen\nDrie stedelijke 'ontwrich... 14-10-2017 rauwe realiteit grootse oplossingen drie stede... -1 0.000000
6442 No Headline In Original Het Financieele Dagblad, PAGINA 13; Blz. 13, 1... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad PAGINA 114 klinkt als muziek\nDe Walkman, van Sony, is vo... 29-04-2023 no headline original klinkt muziek walkman son... -1 0.000000
6443 Groeten uit het hart van de hightech Het Financieele Dagblad, WEEKEND; Blz. 6, 2799... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad WEEKEND 2799 Het is zover voor 'onze man in San Francisco'.... 20-08-2016 groeten hart hightech zover man san francisco ... -1 0.000000
6444 De complete lijst Jonge Talenten 2019 Het Financieele Dagblad, FD PERSOONLIJK; Arbei... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad FD PERSOONLIJK; Arbeidsmarkt 8007 Rebel werkte zes jaar bij zakenbank Morgan Sta... 17-01-2019 complete lijst jonge talenten rebel werkte z j... 1 0.880587
6445 No Headline In Original Het Financieele Dagblad, DE WERELD; Blz. 30, 9... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad DE WERELD 969 The Conversation (Londen)Gates Notes (VS)The E... 08-12-2018 no headline original the conversation londen g... -1 0.000000

6446 rows × 11 columns

In [27]:
# Define the topic name mapping dictionary
topic_names = {
    -1: 'AI in Society',
    0: 'Technology',
    1: 'Politics',
    2: 'Business',
    3: 'Economy',
    4: 'Healthcare',
    5: 'Art',
    6: 'Law',
    7: 'Media',
    8: 'Education',
    9: 'Environment'
}

# Create a new column 'Topic Name' by mapping the 'Topic' column using the topic_names dictionary
data_updated['Topic Name'] = data_updated['Topic'].map(topic_names)

data_updated
Out[27]:
Headline Publication URL News Outlet Type of News Word Count Body Publication Date Combined Topic Probabilities Topic Name
0 Nee, kunstmatige intelligentie gaat ons niet u... Trouw, Verdieping; Blz. 4, 5, 2044 words https://advance.lexis.com/api/document?collect... Trouw Verdieping 2044 Welkom in de AI-fabriek serie\nDat kunstmatige... 07-12-2023 nee kunstmatige intelligentie gaat uitroeien w... 0 0.716202 Technology
1 Wereldleiders zoeken grip op kunstmatige intel... Trouw, Vandaag; Blz. 6, 528 words https://advance.lexis.com/api/document?collect... Trouw Vandaag 528 Op het Britse landgoed Bletchley Park werden t... 03-11-2023 wereldleiders zoeken grip kunstmatige intellig... 1 1.000000 Politics
2 Kunstmatige intelligentie is best bedreigend Trouw, Tijdgeest; Blz. 8, 576 words https://advance.lexis.com/api/document?collect... Trouw Tijdgeest 576 Of kunstmatige intelligentie nuttig is (Tijdge... 13-05-2023 kunstmatige intelligentie best bedreigend kuns... -1 0.000000 AI in Society
3 Mensen zijn een stuk efficiënter dan kunstmati... Trouw, Vandaag; Blz. 3, 741 words https://advance.lexis.com/api/document?collect... Trouw Vandaag 741 De wereld raakte het afgelopen jaar in de ban ... 21-10-2023 mensen stuk efficiënter kunstmatige intelligen... -1 0.000000 AI in Society
4 Bedreigt kunstmatige intelligentie ons godsbeeld? Trouw, Religie en Filosofie; Blz. 8, 9, 1367 w... https://advance.lexis.com/api/document?collect... Trouw Religie en Filosofie 1367 Theologisch elftal\n'In het begin was het Woor... 16-12-2022 bedreigt kunstmatige intelligentie godsbeeld t... 8 1.000000 Education
... ... ... ... ... ... ... ... ... ... ... ... ...
6441 De rauwe realiteit Het Financieele Dagblad, MORGEN; Blz. 4, 2920 ... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad MORGEN 2920 Grootse oplossingen\nDrie stedelijke 'ontwrich... 14-10-2017 rauwe realiteit grootse oplossingen drie stede... -1 0.000000 AI in Society
6442 No Headline In Original Het Financieele Dagblad, PAGINA 13; Blz. 13, 1... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad PAGINA 114 klinkt als muziek\nDe Walkman, van Sony, is vo... 29-04-2023 no headline original klinkt muziek walkman son... -1 0.000000 AI in Society
6443 Groeten uit het hart van de hightech Het Financieele Dagblad, WEEKEND; Blz. 6, 2799... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad WEEKEND 2799 Het is zover voor 'onze man in San Francisco'.... 20-08-2016 groeten hart hightech zover man san francisco ... -1 0.000000 AI in Society
6444 De complete lijst Jonge Talenten 2019 Het Financieele Dagblad, FD PERSOONLIJK; Arbei... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad FD PERSOONLIJK; Arbeidsmarkt 8007 Rebel werkte zes jaar bij zakenbank Morgan Sta... 17-01-2019 complete lijst jonge talenten rebel werkte z j... 1 0.880587 Politics
6445 No Headline In Original Het Financieele Dagblad, DE WERELD; Blz. 30, 9... https://advance.lexis.com/api/document?collect... Het Financieele Dagblad DE WERELD 969 The Conversation (Londen)Gates Notes (VS)The E... 08-12-2018 no headline original the conversation londen g... -1 0.000000 AI in Society

6446 rows × 12 columns

In [28]:
# Check if all topics have been proper labeled
topics = data_updated['Topic'].unique()
print(topics)
[ 0  1 -1  8  9  6  4  5  2  7  3]

Save the DataFrame to an Excel file¶

data_updated.to_excel('/Users/helgegeurtjacobusmoes/Desktop/thesis data/Updated_Data_And_Topics.xlsx', index=False)

Save the DataFrame to a CSV file¶

data_updated.to_csv('/Users/helgegeurtjacobusmoes/Desktop/thesis data/Updated_Data_And_Topics.csv', index=False)

In [30]:
# Calculate the frequency of topics
topic_frequency = data_updated['Topic'].value_counts()

for topic, frequency in topic_frequency.items():
    print("Topic: ", topic)
    print("Frequency: ", frequency)
    print()
Topic:  -1
Frequency:  2498

Topic:  0
Frequency:  821

Topic:  1
Frequency:  595

Topic:  2
Frequency:  594

Topic:  3
Frequency:  476

Topic:  4
Frequency:  406

Topic:  6
Frequency:  279

Topic:  7
Frequency:  251

Topic:  5
Frequency:  231

Topic:  8
Frequency:  151

Topic:  9
Frequency:  144

In [31]:
# Get the topic labels and frequencies
labels = topic_frequency.index
counts = topic_frequency.values

# Color palette adjusted to the specific colors used in the line graph
color_palette = ['brown', 'darkorange', 'green', 'red', 'purple', 'royalblue', 'yellow', 'darkcyan', 'slategray', 'lightseagreen', 'hotpink']

# Create a pie chart
plt.figure(figsize=(8, 8))
plt.pie(counts, labels=labels, autopct='%1.1f%%', colors=color_palette)
plt.title('Topic Frequency')

# Display the pie chart
plt.show()
In [32]:
# Get topics per source
topics_per_source = model.topics_per_class(content_titles, classes=data_updated['News Outlet'])

# Visualize topics per source
model.visualize_topics_per_class(topics_per_source)